# The mpg data frame is a rectangular collection of data (fields and records)
# that ships with ggplot2 (ggplot2::mpg).
# displ - a car's engine size, in liters
# hwy   - a car's fuel efficiency on the highway
# Attach ggplot2 before first use: mpg and every ggplot()/geom_*() call in this
# script are referenced unqualified and are not found in a fresh session
# otherwise. Attaching an already-attached package is a no-op.
library(ggplot2)
mpg
## # A tibble: 234 x 11
##    manufacturer model displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4      1.8  1999     4 auto… f        18    29 p     comp…
##  2 audi         a4      1.8  1999     4 manu… f        21    29 p     comp…
##  3 audi         a4      2    2008     4 manu… f        20    31 p     comp…
##  4 audi         a4      2    2008     4 auto… f        21    30 p     comp…
##  5 audi         a4      2.8  1999     6 auto… f        16    26 p     comp…
##  6 audi         a4      2.8  1999     6 manu… f        18    26 p     comp…
##  7 audi         a4      3.1  2008     6 auto… f        18    27 p     comp…
##  8 audi         a4 q…   1.8  1999     4 manu… 4        18    26 p     comp…
##  9 audi         a4 q…   1.8  1999     4 auto… 4        16    25 p     comp…
## 10 audi         a4 q…   2    2008     4 manu… 4        20    28 p     comp…
## # … with 224 more rows
# Open the documentation page for the mpg dataset
help(mpg)

# Creating a first ggplot2 graphic
## geom_point() adds a layer of points to the plot, which yields a
## scatterplot of engine size (displ) against highway efficiency (hwy)
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy))

# Graphic template
# ggplot(data = <DATA>) + <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>))

# ggplot() on its own, without any geom layer
ggplot(mpg)

# mtcars dataset
# dput() writes the R expression that re-creates mtcars (the structure(...)
# text shown below) and returns the object; the assignment stores that
# returned copy under the name `mtcars`.
# NOTE(review): nothing is imported here - the right-hand side already needs
# mtcars to exist. If the intent is only to load the dataset, data(mtcars)
# would be the conventional call; confirm before changing.
mtcars <- dput(mtcars)
## structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3, 
## 24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4, 
## 30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8, 
## 19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 
## 8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4), 
##     disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8, 
##     167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7, 
##     71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145, 
##     301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95, 
##     123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150, 
##     150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9, 
##     3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 
##     3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76, 
##     3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
##     ), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 
##     3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2, 
##     1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14, 
##     1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61, 
##     19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6, 
##     18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87, 
##     17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
##     ), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 
##     0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1, 
##     1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 
##     0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3, 
##     3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 
##     3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4, 
##     2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1, 
##     2, 2, 4, 6, 8, 2)), row.names = c("Mazda RX4", "Mazda RX4 Wag", 
## "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant", 
## "Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C", 
## "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", 
## "Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic", 
## "Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin", 
## "Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2", 
## "Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora", 
## "Volvo 142E"), class = "data.frame")
# Dimensions of mtcars: row count first, then column count
nrow(mtcars)
## [1] 32
ncol(mtcars)
## [1] 11

# Highway efficiency against number of cylinders
ggplot(mpg) +
  geom_point(aes(x = cyl, y = hwy))

# Why is this not a useful graph?
ggplot(mpg) +
  geom_point(aes(x = class, y = drv))

# The color aesthetic is the advised choice for discrete variables
p <- ggplot(mpg)
p +
  geom_point(aes(x = displ, y = hwy, color = class))

# Mapping size to a discrete variable triggers a warning:
# size is advised for continuous variables only.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, size = class))
## Warning: Using size for a discrete variable is not advised.

# Top: transparency (alpha) mapped to class
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, alpha = class))
## Warning: Using alpha for a discrete variable is not advised.

# Bottom: point shape mapped to class
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy, shape = class))
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have 7.
## Consider specifying shapes manually if you must have them.
## Warning: Removed 62 rows containing missing values (geom_point).

# Make every plotted point blue: color is set outside aes(), so it is a fixed
# value rather than a mapping to a variable.
# The stroke aesthetic is set the same way.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy), color = "blue", stroke = 5)

# Facets: subplots that each display one subset of the data
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_wrap(~ class, nrow = 2)

# Facet on the combination of two variables.
# Those variables are generally discrete (drv and cyl in this case).
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_grid(drv ~ cyl)

# Facet in the row dimension only
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_grid(drv ~ .)

# Facet in the column dimension only
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_grid(. ~ cyl)

# One panel per class, wrapped onto two rows
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  facet_wrap(~ class, nrow = 2)

# Geometric Objects ----

# For example, bar charts use bar geoms, line charts use line geoms, boxplots use boxplot geoms, and so on. Scatterplots break the trend; they use the point geom.

# Left: the raw observations drawn with the point geom
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy))

# Right: the same data drawn with the smooth geom
ggplot(mpg) +
  geom_smooth(aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# A different linetype for each unique value of the mapped variable (drv)
ggplot(mpg) +
  geom_smooth(aes(x = displ, y = hwy, linetype = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# ggplot2 provides over 30 geoms
# A single smooth line for the whole dataset
ggplot(mpg) +
  geom_smooth(aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# One smooth line per drv value via the group aesthetic
ggplot(mpg) +
  geom_smooth(aes(x = displ, y = hwy, group = drv))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# One smooth line per drv value, distinguished by color, with the legend shown
ggplot(mpg) +
  geom_smooth(aes(x = displ, y = hwy, color = drv), show.legend = TRUE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Several geoms in one plot: add one layer per geom
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy)) +
  geom_smooth(aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# The version above repeats the mapping in both layers. To display cty
# instead of hwy you would have to change the variable in two places, and
# might forget one. Passing the mappings to ggplot() avoids the repetition.

# Global mapping: declared once in ggplot() and inherited by every layer

ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Local mappings extend or overwrite the global mappings for that layer only
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(color = class)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Layer-specific data: the smooth line is fitted to the subcompact cars only,
# while the point layer still shows every class.
# filter() is namespace-qualified on purpose: an unqualified filter() resolves
# to stats::filter() whenever dplyr is not attached, and that function cannot
# subset a data frame by a logical condition.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(mapping = aes(color = class)) +
  geom_smooth(
    data = dplyr::filter(mpg, class == "subcompact"),
    se = FALSE  # hide the confidence band around the line
  )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# The two calls below draw exactly the same plot.

# Version 1: data and mapping declared globally in ggplot()
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Version 2: data and mapping repeated locally in each layer
ggplot() +
  geom_point(aes(x = displ, y = hwy), data = mpg) +
  geom_smooth(aes(x = displ, y = hwy), data = mpg)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Statistical Transformations ----

# stat: statistical transformation

# The diamonds dataset comes with ggplot2 and describes ~54,000 diamonds,
# including the price, carat, color, clarity, and cut of each one.
# Bar charts seem simple, but they reveal something subtle about plots.
ggplot(diamonds) +
  geom_bar(aes(x = cut))

# The previous plot can be re-created with stat_count() instead of geom_bar():
ggplot(diamonds) +
  stat_count(aes(x = cut))

# A closer look at how bar charts are built:
# stat = "identity" makes the bar height come straight from the y column (b)
demo <- tribble(
  ~a,      ~b,
  "bar_1", 20,
  "bar_2", 30,
  "bar_3", 40
)
ggplot(demo) +
  geom_bar(aes(x = a, y = b), stat = "identity")

# Proportion bar chart
# after_stat(prop) reads the `prop` variable computed by the count stat; it
# replaces the deprecated ..prop.. notation (needs ggplot2 >= 3.3.0).
# group = 1 puts all bars in one group, so proportions are taken over the
# whole dataset rather than within each cut.
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, y = after_stat(prop), group = 1))

# Summarizes the y values (depth) for each unique x value (cut):
# the point is the median, the line spans the minimum to the maximum.
# fun / fun.min / fun.max replace the deprecated fun.y / fun.ymin / fun.ymax
# arguments (renamed in ggplot2 3.3.0).
ggplot(data = diamonds) +
  stat_summary(
    mapping = aes(x = cut, y = depth),
    fun.min = min,
    fun.max = max,
    fun = median
  )

# after_stat(prop) replaces the deprecated ..prop.. notation throughout
# (needs ggplot2 >= 3.3.0).

# WITHOUT GROUP
# Each cut forms its own group here, so every proportion is computed within
# its own bar.
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, y = after_stat(prop)))

ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, fill = color, y = after_stat(prop)))

# WITH GROUP
# group = 1 places all observations in a single group, so proportions are
# computed relative to the whole dataset.
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, y = after_stat(prop), group = 1))

ggplot(data = diamonds) +
  geom_bar(
    mapping = aes(x = cut, fill = color, y = after_stat(prop), group = 1)
  )

# Position Adjustments ----

# color maps the variable to the outline of each bar
ggplot(diamonds) +
  geom_bar(aes(x = cut, color = cut))

# fill maps the variable to the interior of each bar
ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = cut))

# Map the fill aesthetic to another variable, like clarity:
# the bars are automatically stacked.
ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = clarity))

# position = "identity" places each object exactly where it falls in the
# context of the graph. It is more useful for 2D geoms, like points, where it
# is the default.

# Semi-transparent fills
ggplot(diamonds, aes(x = cut, fill = clarity)) +
  geom_bar(position = "identity", alpha = 1 / 5)

# Outlines only, no fill
ggplot(diamonds, aes(x = cut, color = clarity)) +
  geom_bar(position = "identity", fill = NA)

# position = "fill": useful for comparing proportions
ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = clarity), position = "fill")

# position = "dodge": makes it easier to compare individual values
ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = clarity), position = "dodge")

# position = "jitter": adds a small amount of random noise to each point,
# which shows where the data are denser.
ggplot(mpg) +
  geom_point(aes(x = displ, y = hwy), position = "jitter")

# geom_jitter() - adds a small amount of random variation to the location of
# each point. It is a point layer in its own right, so no extra geom_point()
# is added: that would draw every observation a second time at its exact,
# un-jittered position and bring the overplotting back.
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_jitter()

# geom_count() - counts the number of observations at each location and maps
# the count to point size; useful for discrete data and overplotting.
# It already draws one point per location, so geom_point() is redundant here.
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_count()

# Coordinate System ----

# Vertical box plot of highway efficiency per class
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot()

# Horizontal box plot, obtained by flipping the axes
ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot() +
  coord_flip()

# Draw a map from polygon coordinates
nz <- map_data("nz")
ggplot(nz, aes(x = long, y = lat, group = group)) +
  geom_polygon(color = "black", fill = "white")

# coord_quickmap() - sets the aspect ratio correctly for maps
ggplot(nz, aes(x = long, y = lat, group = group)) +
  geom_polygon(color = "black", fill = "white") +
  coord_quickmap()

# labs() - modify axis, legend, and plot labels (both axis titles are
# removed here). The base plot is stored once and reused below.
bar <- ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = cut), width = 1, show.legend = FALSE) +
  labs(x = NULL, y = NULL) +
  theme(aspect.ratio = 1)

# Same bars with the axes swapped
bar + coord_flip()

# Same bars in polar coordinates
bar + coord_polar()

# coord_fixed() - forces a specific ratio between the physical representation
#                 of data units on the two axes
# geom_abline() - adds a reference line specified by slope and intercept
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point() +
  geom_abline() +
  coord_fixed()

# The Layered Grammar of Graphics ----

# Function Template

# ggplot(data = <DATA>) +
#       <GEOM_FUNCTION>(
#          mapping = aes(<MAPPINGS>),
#          stat = <STAT>,
#          position = <POSITION>
# )+ <COORDINATE_FUNCTION> + <FACET_FUNCTION>
# Read the documentation using ?<expression>
?mpg
# List the variables (column names)
colnames(mpg)
##  [1] "manufacturer" "model"        "displ"        "year"
##  [5] "cyl"          "trans"        "drv"          "cty"
##  [9] "hwy"          "fl"           "class"
# List the dimensions (rows, columns)
dim(mpg)
## [1] 234  11
# Class of the object
class(mpg)
## [1] "tbl_df"     "tbl"        "data.frame"
# Print the first 2 rows of the data frame
head(mpg, n = 2)
## # A tibble: 2 x 11
##   manufacturer model displ  year   cyl trans  drv     cty   hwy fl    class
##   <chr>        <chr> <dbl> <int> <int> <chr>  <chr> <int> <int> <chr> <chr>
## 1 audi         a4      1.8  1999     4 auto(… f        18    29 p     comp…
## 2 audi         a4      1.8  1999     4 manua… f        21    29 p     comp…
# Print the last 2 rows of the data frame
tail(mpg, n = 2)
## # A tibble: 2 x 11
##   manufacturer model  displ  year   cyl trans drv     cty   hwy fl    class
##   <chr>        <chr>  <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 volkswagen   passat   2.8  1999     6 manu… f        18    26 p     mids…
## 2 volkswagen   passat   3.6  2008     6 auto… f        17    26 p     mids…
# Per-column summary statistics
summary(mpg)
##  manufacturer          model               displ            year     
##  Length:234         Length:234         Min.   :1.600   Min.   :1999  
##  Class :character   Class :character   1st Qu.:2.400   1st Qu.:1999  
##  Mode  :character   Mode  :character   Median :3.300   Median :2004  
##                                        Mean   :3.472   Mean   :2004  
##                                        3rd Qu.:4.600   3rd Qu.:2008  
##                                        Max.   :7.000   Max.   :2008  
##       cyl           trans               drv                 cty       
##  Min.   :4.000   Length:234         Length:234         Min.   : 9.00  
##  1st Qu.:4.000   Class :character   Class :character   1st Qu.:14.00  
##  Median :6.000   Mode  :character   Mode  :character   Median :17.00  
##  Mean   :5.889                                         Mean   :16.86  
##  3rd Qu.:8.000                                         3rd Qu.:19.00  
##  Max.   :8.000                                         Max.   :35.00  
##       hwy             fl               class          
##  Min.   :12.00   Length:234         Length:234        
##  1st Qu.:18.00   Class :character   Class :character  
##  Median :24.00   Mode  :character   Mode  :character  
##  Mean   :23.44                                        
##  3rd Qu.:27.00                                        
##  Max.   :44.00
# Attach skimr, which provides skim().
# NOTE(review): per the attach message below, this skimr version masks
# stats::filter, so any unqualified filter() call after this point may
# resolve differently - qualify such calls with their package name.
library(skimr)
## Warning: package 'skimr' was built under R version 3.5.2
## 
## Attaching package: 'skimr'
## The following object is masked from 'package:stats':
## 
##     filter
# Per-variable summary of mpg, grouped by variable type (output below)
skim(mpg)
## Skim summary statistics
##  n obs: 234 
##  n variables: 11 
## 
## ── Variable type:character ──────────────────────────────────────────
##      variable missing complete   n min max empty n_unique
##         class       0      234 234   3  10     0        7
##           drv       0      234 234   1   1     0        3
##            fl       0      234 234   1   1     0        5
##  manufacturer       0      234 234   4  10     0       15
##         model       0      234 234   2  22     0       38
##         trans       0      234 234   8  10     0       10
## 
## ── Variable type:integer ────────────────────────────────────────────
##  variable missing complete   n    mean   sd   p0  p25    p50  p75 p100
##       cty       0      234 234   16.86 4.26    9   14   17     19   35
##       cyl       0      234 234    5.89 1.61    4    4    6      8    8
##       hwy       0      234 234   23.44 5.95   12   18   24     27   44
##      year       0      234 234 2003.5  4.51 1999 1999 2003.5 2008 2008
##      hist
##  ▅▇▇▇▁▁▁▁
##  ▇▁▁▇▁▁▁▇
##  ▃▇▃▇▅▁▁▁
##  ▇▁▁▁▁▁▁▇
## 
## ── Variable type:numeric ────────────────────────────────────────────
##  variable missing complete   n mean   sd  p0 p25 p50 p75 p100     hist
##     displ       0      234 234 3.47 1.29 1.6 2.4 3.3 4.6    7 ▇▇▅▅▅▃▂▁